In [1]:
#### Visual Analytics Coursework ####
In [2]:
# Import the Required Modules
import time
start = time.time()
In [3]:
# Set matplotlib to plot in the notebook
%pylab inline

import pandas as pd
import numpy as np
from __future__ import division
import utils
import seaborn
from six.moves import zip
from sklearn import preprocessing
from rpy2.robjects.packages import importr

# Set the Default Seaborn Colours
seaborn.set()
colors = seaborn.color_palette()
seaborn.set_context(rc={"figure.figsize": (12, 12)})

# Turn off Pandas Future Warnings
pd.set_option('chained_assignment',None)
Populating the interactive namespace from numpy and matplotlib

In [4]:
from collections import OrderedDict

import numpy as np
from bokeh.charts import Histogram
from bokeh.plotting import *

output_notebook()
BokehJS successfully loaded.
In [5]:
# Import the Data - Using Pandas
In [6]:
Data_CF = pd.read_csv('Crossfit_Open_2011_Dataset.csv', 
                      sep= ',')
In [7]:
# Check the Import for Errors
In [8]:
print(Data_CF.head(5))
   athlete_ID                    nameURL First_Name Last_Name  \
0       47717       /athletes/dan-bailey        Dan    Bailey   
1       16400   /athletes/joshua-bridges     Joshua   Bridges   
2       58666     /athletes/rich-froning       Rich   Froning   
3      169510       /athletes/mikko-salo      Mikko      Salo   
4       12005  /athletes/austin-malleolo     Austin  Malleolo   

                Region  age sex&division Gender height  Height_cm  ...    \
0         Central East   26           M0      M    NaN        NaN  ...     
1  Southern California   27           M0      M  5' 5"      165.1  ...     
2         Central East   23           M0      M    NaN        NaN  ...     
3               Europe   31           M0      M    NaN        NaN  ...     
4            Northeast   23           M0      M  5' 5"      165.1  ...     

   score2   rank2   score3  rank3   score4  rank4   score5  rank5   score6  \
0     575       5       82      9      162      5      429      8      148   
1     600       2       78     25      178      1      430      7      169   
2     519      39       93      1      167      3      422     14      161   
3     546      16       90      2      158     11      401     32      160   
4     575       5       80     17      154     16      402     31      149   

   rank6  
0     13  
1      1  
2      3  
3      4  
4     12  

[5 rows x 26 columns]

In [9]:
# Get a List of Column Headers for Reference
Column_Names = Data_CF.columns
# Print Column names into a List
[x for x in Column_Names]
Out[9]:
['athlete_ID',
 'nameURL',
 'First_Name',
 'Last_Name',
 'Region',
 'age',
 'sex&division',
 'Gender',
 'height',
 'Height_cm',
 'Weight_Orginal',
 'Weight_kg',
 ' overall-points',
 'overall-rank',
 ' score1',
 'rank1',
 ' score2',
 ' rank2',
 ' score3',
 'rank3',
 ' score4',
 'rank4',
 ' score5',
 'rank5',
 ' score6',
 'rank6']
In [10]:
# Get a subset of the Data Set of Features for Analysis
Data_CF_VA = Data_CF[['athlete_ID',
                      'First_Name',
                      'Last_Name',
                      'Region',
                      'age',
                      'Gender',
                      'Height_cm',
                      'Weight_kg',
                      ' overall-points',
                      'overall-rank',
                      ' score1',
                      'rank1',
                      ' score2',
                      ' rank2',
                      ' score3',
                      'rank3',
                      ' score4',
                      'rank4',
                      ' score5',
                      'rank5',
                      ' score6',
                      'rank6']]
In [11]:
# Get some Summary Statistics of the Data
Data_CF_VA.describe().T
Out[11]:
count mean std min 25% 50% 75% max
athlete_ID 25973 96017.548493 69823.969628 978.00 40176 81082.00 139906.00 589635
age 25972 31.686239 8.001983 12.00 26 30.00 36.00 100
Height_cm 15449 177.461691 47.709406 91.44 170 175.26 182.88 1640
Weight_kg 15493 77.818757 14.262462 0.00 68 79.00 86.00 200
overall-points 14204 23811.134328 17127.603900 11.00 9861 21056.50 35384.75 68005
overall-rank 25973 1823.984792 2460.552346 -1.00 -1 187.00 3313.00 8615
score1 25973 185.840488 98.217650 -1.00 133 205.00 260.00 448
rank1 25973 4525.945405 3975.877811 -1.00 682 3880.00 7060.00 14071
score2 25973 254.063027 142.802237 -1.00 220 289.00 341.00 7779
rank2 25973 4113.851577 3813.233792 -1.00 317 3415.00 6672.00 12842
score3 25973 21.015632 19.888825 -1.00 -1 19.00 36.00 93
rank3 25973 3377.984291 3560.604321 -1.00 -1 2359.00 5349.00 12008
score4 25973 57.439341 40.466600 -1.00 -1 69.00 90.00 178
rank4 25973 2989.617064 3298.393702 -1.00 -1 1949.00 4838.00 11222
score5 25973 148.467447 121.963669 -1.00 -1 188.00 249.00 516
rank5 25973 2615.797636 3058.748685 -1.00 -1 1349.00 4610.00 10344
score6 25973 44.325261 44.478961 -1.00 -1 50.00 85.00 169
rank6 25973 1981.245062 2584.304112 -1.00 -1 389.00 3589.00 9011
In [12]:
# Plot each feature on a history gram - Phase 1 of Methodology
Data_CF_VA_Columns = Data_CF_VA.columns
In [13]:
# Pandas Histogram Plots - Height (Transformed Variable)
Data_CF_VA['Height_cm'].hist(bins=100); 
plt.title('Height Distribution'); 
plt.ylabel('Frequency'); 
plt.xlabel('Height (cm)'); 
plt.show()
# Clearly Outliers Exists - Accomadation for this will need to be applied
In [14]:
# Pandas Histogram Plots - Age
Data_CF_VA['age'].hist(bins=24)
plt.title('Age Distribution'); plt.ylabel('# Athletes'); plt.xlabel('Age'); plt.show()
In [15]:
# Pandas Histogram Plots - Weight_kg
Data_CF_VA['Weight_kg'].hist(bins=100); 
plt.title('Weight (kg) Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Weight (kilograms)'); 
plt.show()
In [16]:
# Pandas Histogram Plots - Overall Points
Data_CF_VA[Data_CF_VA_Columns[8]].hist(bins=100); 
plt.title('Final Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 100 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [17]:
# Pandas Histogram Plots - Score After Week 1
Data_CF_VA[' score1'].hist(bins=50); plt.title('Week 1 Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [18]:
# Pandas Histogram Plots - Score After Week 2
Data_CF_VA[' score2'].hist(bins=50); 
plt.title('Week 2 Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [19]:
# Pandas Histogram Plots - Score After Week 3
Data_CF_VA[' score3'].hist(bins=50); 
plt.title('Week 3 Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [20]:
# Pandas Histogram Plots - Score After Week 4
Data_CF_VA[' score4'].hist(bins=50); 
plt.title('Week 4 Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [21]:
# Pandas Histogram Plots - Score After Week 5
Data_CF_VA[' score5'].hist(bins=50); 
plt.title('Week 5 Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [22]:
# Pandas Histogram Plots - Score After Week 5.1
Data_CF_VA[' score6'].hist(bins=50); 
plt.title('Week 5.1 Points Distribution'); 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.show()

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0
In [23]:
# Phase 1 Conclusions

#
# Age - The histogram plot conveys a distribution that I would expect, whereby its heavily dominated by the number of 
#        athletes in the age of 20-40.
#        No requirement to normalise the data yet or outlier removal since the data is distributed as expected.
#        
# Height - The histogram plot clearly indentifies a number of athletes that have erroneous/incorrect values for their height.
#         Outliers will need to be removed and then the plot will need to be reassesed
#         Transform in the form of normalisation or standardisation may be required.
#
# Weight - The plot appears to be well distributed amoungst the bins, when changeing the number of bins it can be noted that 
#         there are some outliers - removal required since they appear to be dubious. <50kg and >160kg
#         
# Overall Points - There appears to be some clear distictions in the distbution
#                 1 - A high peak and clear group can be seen where the number of points <500
#                 2 - Another group from about 500-3000 points
#                 3 - The final set where the number of points >3000
#                 
# Week 1-5 Scores - The score plots consider all possible values.
#                  When considering and comparing groups consideration of the missing values, where the score is 0, need to be
#                  accounted for.
#                  The number of individuals entering appears to be decreasing from Week 1 to Week 5.
#                  Further analysis will be need to understand if this really is the case
In [24]:
# Phase 1.1 - Adjustments to Features based on the Conclusions mentioned above

# Dealing with Height

# Height - The tallest man alive is 251cm - Sultan Kösen (Turkey, b.10 December 1982)
# http://www.guinnessworldrecords.com/world-records/tallest-man-living
Height_Rule = Data_CF_VA.Height_cm < 251

Data_CF_VA['Height_Gr_251'] = Data_CF_VA.Height_cm < 251

# Frequency TABLE
pd.Series(Data_CF_VA['Height_Gr_251']).value_counts()

# The output below indicate the number of usable instances where the weight can be included as part of the analysis
Out[24]:
True     15409
False    10564
dtype: int64
In [25]:
# Define a function to create the central tendency about a CI
def confid_int_plot(point, ci, y, color, label):
    plot(ci, [y, y], "-", color=color, linewidth=4, label=label)
    plot(point, y, "o", color=color, markersize=10)
    
# Position on the y-axis where the INterval will be plotted
int_y = 500
In [26]:
# Calculate some statistics
d = Data_CF_VA[Data_CF_VA.Height_cm < 251].Height_cm
m = d.mean()
s = d.std()
 
# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d.values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Height Distribution') 
plt.ylabel('# Athletes')
plt.xlabel('Height (cm)')
plt.legend(loc="best")
plt.show()
In [27]:
# Adjustment to the Weight Feature to Remove the Erroneous Values

# Calculate some statistics
d = Data_CF_VA.Weight_kg
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes')
plt.xlabel('Weight (kg)')
plt.legend(loc="best")
plt.show()
In [28]:
# Get the scores into a seperate Dataframe
Data_CF_VA_Scores = Data_CF_VA[[' score1',
                                ' score2',
                                ' score3',
                                ' score4',
                                ' score5',
                                ' score6']]
In [29]:
# Convert all the -1 to NAN values
Data_CF_VA_Scores[Data_CF_VA_Scores == -1] = np.nan
In [30]:
# Show Participation for Each Score Submitted
Number_Null = []; Number_Non_Null = []; DF_Length = Data_CF_VA_Scores.shape[0]

# Loop through each column to count values
for col in Data_CF_VA_Scores:
    Values = Data_CF_VA_Scores[col].count()
    Number_Non_Null.append(Values)
    Number_Null.append(DF_Length - Values)

# Convert List to Tuple for Plotting
def totuple(a):
    try:
        return tuple(totuple(i) for i in a)
    except TypeError:
        return a

# Convert Ranges to Tuples
Number_Non_Null = totuple(Number_Non_Null)
Number_Null = totuple(Number_Null)

ind = np.arange(len(Number_Non_Null))    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

# Plot Each Type
p1 = plt.bar(ind, Number_Non_Null, width, color='g')
p2 = plt.bar(ind, Number_Null, width, color='y',bottom=Number_Non_Null)

# Annotate the Chart
plt.ylabel('Number of Athletes Participating')
plt.title('Participation per Week')
plt.xticks(ind+width/2., ('Week1', 'Week2', 'Week3', 'Week4', 'Week5', 'Week6') )
plt.legend( (p1[0], p2[0]), ('Score Submitted', 'No Score Submitted'), loc="best")
plt.show()

# There appears to be a consistent drop off in the number of scores submitted - investigate this in terms of 
# Percentage change
In [31]:
# Calculate Percentage Changes per Week
def percent_change(old, new):
    change = new - old
    percentage_change = (change / float(old))
    return percentage_change * 100

Number_Non_Null_Change = []
for i in ind:
    if i+1 > ind.max():
        break
    else:
        Number_Non_Null_Change.append(percent_change(Number_Non_Null[i], Number_Non_Null[i+1]))
        
Number_Non_Null_Change = totuple(Number_Non_Null_Change)

# Plot Each Type
p1 = plt.bar(np.arange(len(Number_Non_Null_Change)), Number_Non_Null_Change, width, color='r')

# Annotate the Chart
plt.ylabel('% Change of the Number of Athletes Participating')
plt.ylim( (-20, 20) )
plt.title('Participation per Week')
plt.xticks(ind+width/2., ('Week1 - Week2', 'Week2 - Week3', 'Week3 - Week4', 'Week4 - Week5', 'Week5 - Week6'), rotation=45)
plt.legend(labels = 'Week on Week Change')
plt.show()

# The biggest drops between weeks occurred between 2-3, 4-5 and 5-6.
/usr/lib/pymodules/python2.7/matplotlib/axes.py:4747: UserWarning: No labeled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labeled objects found. "

In [32]:
# Revisulise the with the Null Values Excluded

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0

# Calculate some statistics
d = Data_CF_VA_Scores[' score1']
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# No adjustment since the distribution of the data in the majority is within the CI
In [33]:
# Revisulise the with the Null Values Excluded

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0

# From the Previous visulisation it can be seen that there exist some outliers - the Green bar indicates a Confidence 
# of 4 Standard deviations from the mean

# Calculate some statistics
d = Data_CF_VA_Scores[' score2']
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# I will remove those outside this range and re-plot
Score_2_Rule = (d <= m + 4*s) & (d >= m - 4*s)

# Subset using the rule above
d = d[Score_2_Rule]

# Calculate some statistics
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# The resulting Plots now corrects alot of the observed problems of the dataset
In [34]:
# Revisulise the with the Null Values Excluded

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0

# Calculate some statistics
d = Data_CF_VA_Scores[' score3']
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# No adjustment since the distribution of the data in the majority is within the CI
In [35]:
# Revisulise the with the Null Values Excluded

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0

# Calculate some statistics
d = Data_CF_VA_Scores[' score4']
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# No adjustment since the distribution of the data in the majority is within the CI
# Where the data is not - it will be excused as excluding them it would mean that the "Top" Scores would be removed
In [36]:
# Revisulise the with the Null Values Excluded

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0

# Calculate some statistics
d = Data_CF_VA_Scores[' score5']
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# No adjustment since the distribution of the data in the majority is within the CI
In [37]:
# Revisulise the with the Null Values Excluded

# Its important to note that the 'Top' performing athelets are atheletes with the lowest scores <> 0

# Calculate some statistics
d = Data_CF_VA_Scores[' score6']
m = d.mean()
s = d.std()

# Pandas Histogram Plots - Height (Transformed Variable)
plt.hist(d[pd.notnull(d)].values, 50)

# Add the Confidence Interval
confid_int_plot(m, [m - s*4, m + s*4], int_y, colors[1], "4 Standard Deviations Interval")

# Add Labelling and Legends on to the Plot
plt.title('Weight Distribution') 
plt.ylabel('# Athletes'); 
plt.xlabel('Points (binned 50 into groups)'); 
plt.legend(loc="best")
plt.show()

# No adjustment since the distribution of the data in the majority is within the CI
In [38]:
# Apply All of the Changes to the Dataset and obtain Summary Statistics

# An Overview of Changes to Apply to the Main Dataset - Data_CF_VA

# 1  -  Change -1 values to NAN
# 2  -  Apply the Score2 Rule to exclude the outliers
# 3  -  Remove the Erroneous Height Values
# 4  -  Remove NAN Overall Points

# 1  -  Change -1 values to NAN
for name in Data_CF_VA_Columns:
    # Replace only those columns that contain score in the label
    if name.startswith(' score'):
        Data_CF_VA[name].replace('-1', value = np.nan, inplace = True)
        
# 2  -  Apply the Score2 Rule to exclude the outliers
# Calculate some statistics
d = Data_CF_VA[' score2']
m = d.mean()
s = d.std()

# Rule to Remove +- 4 Standard Deviations from the Dataset
Score_2_Rule = (d <= m + 4*s) & (d >= m - 4*s)

# Exclude values from the Dataset
Data_CF_VA = Data_CF_VA[Score_2_Rule]

# 3  -  Remove the Erroneous Height Values 
Height_Rule = (Data_CF_VA.Height_cm < 251)

# Exclude values from the Dataset
Data_CF_VA = Data_CF_VA[Height_Rule]

# 4  -  Remove NAN Overall Points
Rule4 = pd.notnull(Data_CF_VA[' overall-points'])

Data_CF_VA =  Data_CF_VA[Rule4]

# 5  -  Remove nan Weight Values
Rule5 = Data_CF_VA['Weight_kg'] >= 0

Data_CF_VA =  Data_CF_VA[Rule5]
In [39]:
# Remaining Dataset Size after Outlier and Spurious Data values
print('Remaining Dataset Size\n\nNumber of Rows:     %d\nNumber of Features: %d') % (Data_CF_VA.shape[0], Data_CF_VA.shape[1])
Remaining Dataset Size

Number of Rows:     9447
Number of Features: 23

In [40]:
# PHASE 2 - From Methodology

# Investigate relationships between Features

# All split by Gender - Splitting them by Gender
Data_CF_VA.Gender.value_counts()

# Proposed Investigation

# Score - Age
# Score - Overall Rank
# Score - Weight
# Region - Score - Box plots
# Weight - Height - Coloured by Overall Rank
Out[40]:
M    6966
F    2481
dtype: int64
In [41]:
# Get Male and Female Datasets
Data_CF_VA_Male = Data_CF_VA[Data_CF_VA.Gender == 'M']
Data_CF_VA_Female = Data_CF_VA[Data_CF_VA.Gender == 'F']
In [42]:
# Define Colours - Female
colorField = Data_CF_VA_Female[' overall-points'].as_matrix()

cm = np.array(["#C7E9B4", "#7FCDBB", "#41B6C4", "#1D91C0", "#225EA8", "#0C2C84"])
ix = ((colorField-colorField.min())/(colorField.max()-colorField.min())*5).astype('int')
colorsF = cm[ix]

# Define Colours - Male
colorField = Data_CF_VA_Female[' overall-points'].as_matrix()

cm = np.array(["#C7E9B4", "#7FCDBB", "#41B6C4", "#1D91C0", "#225EA8", "#0C2C84"])
ix = ((colorField-colorField.min())/(colorField.max()-colorField.min())*5).astype('int')
colorsM = cm[ix]
In [43]:
# Scatter Plots -> Score - Age

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male['age'].astype(int).as_matrix()

TOOLS="resize,crosshair,pan,wheel_zoom,box_zoom,reset,tap,previewsave,box_select,poly_select,lasso_select"

# Create a figure
p1 = figure(tools=TOOLS, 
           title="Overall Points v Age - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p1.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p1)
In [44]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female['age'].astype(int).as_matrix()

# Create a figure
p2 = figure(tools=TOOLS, 
           title="Overall Points v Age - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p2.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p2)
In [45]:
# Scatter Plots -> Score - Overall Rank

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score1'].astype(int).as_matrix()

# Create a figure
p3 = figure(tools=TOOLS, 
           title="Overall Points v Score1 - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)
           
# Scatter plot creation
p3.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p3)
In [46]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score1'].astype(int).as_matrix()

# Create a figure
p4 = figure(tools=TOOLS, 
           title="Overall Points v Score1 - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p4.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p4)
In [47]:
# Scatter Plots -> Score - Weight

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score2'].astype(int).as_matrix()

# Create a figure
p5 = figure(tools=TOOLS, 
           title="Overall Points v Score2 - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p5.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p5)
In [48]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score2'].astype(int).as_matrix()

# Create a figure
p6 = figure(tools=TOOLS, 
           title="Overall Points v Score2 - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p6.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p6)
In [49]:
# Scatter Plots -> Score - Weight

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score3'].astype(int).as_matrix()

# Create a figure
p7 = figure(tools=TOOLS, 
           title="Overall Points v Score3 - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p7.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p7)
In [50]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score3'].astype(int).as_matrix()

# Create a figure
p8 = figure(tools=TOOLS, 
           title="Overall Points v Score3 - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p8.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p8)
In [51]:
# Scatter Plots -> Score - Weight

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score4'].astype(int).as_matrix()

# Create a figure
p9 = figure(tools=TOOLS, 
           title="Overall Points v Score4 - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p9.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p9)
In [52]:
# Scatter Plots -> Score - Weight

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score5'].astype(int).as_matrix()

# Create a figure
p12 = figure(tools=TOOLS, 
           title="Overall Points v Score5 - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p12.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=0.6, 
          line_color=None)

# Show plot in Ipython Notebook
show(p12)
In [53]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score4'].astype(int).as_matrix()

# Create a figure
p11 = figure(tools=TOOLS, 
           title="Overall Points v Score4 - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p11.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p11)
In [54]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score5'].astype(int).as_matrix()

# Create a figure
p13 = figure(tools=TOOLS, 
           title="Overall Points v Score5 - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p13.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p13)
In [55]:
# Scatter Plots -> Score - Weight

# Define Ranges
y = Data_CF_VA_Male[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Male[' score6'].astype(int).as_matrix()

# Create a figure
p14 = figure(tools=TOOLS, 
           title="Overall Points v Score6 - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p14.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p14)
In [56]:
# Define Ranges
y = Data_CF_VA_Female[' overall-points'].astype(int).as_matrix()
x = Data_CF_VA_Female[' score6'].astype(int).as_matrix()

# Create a figure
p15 = figure(tools=TOOLS, 
           title="Overall Points v Score6 - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p15.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p15)
In [57]:
# Scatter Plots -> Weight - Height

# Define Ranges
y = Data_CF_VA_Male['Height_cm'].astype(int).as_matrix()
x = Data_CF_VA_Male['Weight_kg'].astype(int).as_matrix()

# Create a figure
p16 = figure(tools=TOOLS, 
           title="Weight v height - Male",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p16.scatter(x,
          y,
          fill_color=colorsM,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p16)
In [58]:
# Define Ranges
y = Data_CF_VA_Female['Height_cm'].astype(int).as_matrix()
x = Data_CF_VA_Female['Weight_kg'].astype(int).as_matrix()

# Create a figure
p17 = figure(tools=TOOLS, 
           title="Weight v Height - Female",
           toolbar_location="left",
           plot_width = 800, 
           plot_height= 800)

# Scatter plot creation
p17.scatter(x,
          y,
          fill_color=colorsF,
          fill_alpha=1, 
          line_color=None)

# Show plot in Ipython Notebook
show(p17)
In [59]:
# PHASE 3 - Describing the relationships

# Get the scores into a seperate Dataframe
Data_CF_VA_Scores_Male = Data_CF_VA_Male[['overall-rank',
                                                ' score1',
                                                ' score2',
                                                ' score3',
                                                ' score4',
                                                ' score5',
                                                ' score6',
                                                'Weight_kg',
                                                'age',
                                                'Height_cm']]

Data_CF_VA_Scores_Female = Data_CF_VA_Female[['overall-rank',
                                                ' score1',
                                                ' score2',
                                                ' score3',
                                                ' score4',
                                                ' score5',
                                                ' score6',
                                                'Weight_kg',
                                                'age',
                                                'Height_cm']]

Actual_Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.copy()
Actual_Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.copy()
In [60]:
# Correlation Plot - Male
seaborn.corrplot(Data_CF_VA_Scores_Male.dropna())
Out[60]:
<matplotlib.axes.AxesSubplot at 0x7f71c659e350>
In [61]:
# Correlation Plot - Female
seaborn.corrplot(Data_CF_VA_Scores_Female.dropna())
Out[61]:
<matplotlib.axes.AxesSubplot at 0x7f71c6082250>
In [62]:
# PHASE 4 - Exploring the findings iteratively through interaction of Computational Techniques
In [63]:
# Autoencoder - Dimensionality Reduction

# Prepare the Data - Normalise
# Train the Neural Network
# Reconstruct to Test Performance of the Autoencoder
# Encoder to get two features

# Export Data to R and Use h2o with Tableau
In [64]:
# An overview of Autoencoder

# Reasoning for use:

# Advantage of using PCA is restricted to the linearity assumption, whereas an auto encoders can have nonlinear enoder/decoders.

print('An Example of an Autoencoder is shown below\n')

print('In order to use this Autoencoder the following steps are:\n')

print('1 - Training the Networks - where the Input layer is the same as the output layer\n')
print('2 - Access the Reconstruction Error - Ensure the error is low before moving on\n')
print('3 - Encoding - Encoder the data on both the Male and Females datasets')
An Example of an Autoencoder is shown below

In order to use this Autoencoder the following steps are:

1 - Training the Networks - where the Input layer is the same as the output layer

2 - Access the Reconstruction Error - Ensure the error is low before moving on

3 - Encoding - Encoder the data on both the Male and Females datasets

In [65]:
from IPython.display import Image
Image(filename='autoencoder.png')
Out[65]:
In [66]:
# Denoising Autoencoder - To be used for Dimensionality Reduction

# Denoising Autoencoders (dA)
#
# References :
#   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
#   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
#   2008
#
#   - DeepLearningTutorials
#   https://github.com/lisa-lab/DeepLearningTutorials
#   
#   - Yusuke Sugomori: Stochastic Gradient Descent for Denoising Autoencoders,
# http://yusugomori.com/docs/SGD_DA.pdf
 
 
import sys
import numpy
 
 
numpy.seterr(all='ignore')
 
def sigmoid(x):
    return 1. / (1 + numpy.exp(-x))
 
 
class dA(object):
    def __init__(self, input=None, n_visible=2, n_hidden=3, \
        W=None, hbias=None, vbias=None, numpy_rng=None):
 
        self.n_visible = n_visible  # num of units in visible (input) layer
        self.n_hidden = n_hidden    # num of units in hidden layer
 
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)
            
        if W is None:
            a = 1. / n_visible
            initial_W = numpy.array(numpy_rng.uniform(  # initialize W uniformly
                low=-a,
                high=a,
                size=(n_visible, n_hidden)))
 
            W = initial_W
 
        if hbias is None:
            hbias = numpy.zeros(n_hidden)  # initialize h bias 0
 
        if vbias is None:
            vbias = numpy.zeros(n_visible)  # initialize v bias 0
 
        self.numpy_rng = numpy_rng
        self.x = input
        self.W = W
        self.W_prime = self.W.T
        self.hbias = hbias
        self.vbias = vbias
 
        # self.params = [self.W, self.hbias, self.vbias]
 
 
        
    def get_corrupted_input(self, input, corruption_level):
        assert corruption_level < 1
 
        return self.numpy_rng.binomial(size=input.shape,
                                       n=1,
                                       p=1-corruption_level) * input
 
    # Encode
    def get_hidden_values(self, input):
        return sigmoid(numpy.dot(input, self.W) + self.hbias)
 
    # Decode
    def get_reconstructed_input(self, hidden):
        return sigmoid(numpy.dot(hidden, self.W_prime) + self.vbias)
 
 
    def train(self, lr=0.1, corruption_level=0.3, input=None):
        if input is not None:
            self.x = input
 
        x = self.x
        tilde_x = self.get_corrupted_input(x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
 
        L_h2 = x - z
        L_h1 = numpy.dot(L_h2, self.W) * y * (1 - y)
 
        L_vbias = L_h2
        L_hbias = L_h1
        L_W =  numpy.dot(tilde_x.T, L_h1) + numpy.dot(L_h2.T, y)
 
 
        self.W += lr * L_W
        self.hbias += lr * numpy.mean(L_hbias, axis=0)
        self.vbias += lr * numpy.mean(L_vbias, axis=0)
 
 
 
    def negative_log_likelihood(self, corruption_level=0.3):
        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
 
        cross_entropy = - numpy.mean(
            numpy.sum(self.x * numpy.log(z) +
            (1 - self.x) * numpy.log(1 - z),
                      axis=1))
 
        return cross_entropy
 
 
    def reconstruct(self, x):
        y = self.get_hidden_values(x)
        z = self.get_reconstructed_input(y)
        return z
 
 
 
def test_dA(learning_rate=0.1, corruption_level=0.1, training_epochs=500):
    data = numpy.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
    
    rng = numpy.random.RandomState(123)
 
    # construct dA
    da = dA(input=data, n_visible=20, n_hidden=2, numpy_rng=rng)
 
    # train
    for epoch in xrange(training_epochs):
        da.train(lr=learning_rate, corruption_level=corruption_level)
        # cost = da.negative_log_likelihood(corruption_level=corruption_level)
        # print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost
        # learning_rate *= 0.95
 
 
    # test
    x = numpy.array([[1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0]])
 
    print da.reconstruct(x)
    
# Main 
if __name__ == "__main__":
    #test_dA()
    print('Autoencoder Class built - Ready for testing')
Autoencoder Class built - Ready for testing

In [67]:
# Prepare the Data
data_M_numerical = Data_CF_VA_Scores_Male.as_matrix()
data_F_numerical = Data_CF_VA_Scores_Female.as_matrix()
In [68]:
# Train the Neural Network - Male

# Define Parameters
learning_rate     = 0.001
corruption_level  = 0.3
training_epochs   = 5000

# Training
Male_Autoencoder = dA(input = data_M_numerical,
                      n_visible = data_M_numerical.shape[1],
                      n_hidden = 2)
In [69]:
# Train the Neural Network - Female

# Define Parameters
learning_rate     = 0.001
corruption_level  = 0.3
training_epochs   = 5000

# Training
Female_Autoencoder = dA(input = data_F_numerical,
                      n_visible = data_F_numerical.shape[1],
                      n_hidden = 2)
In [70]:
Mcost = Male_Autoencoder.negative_log_likelihood(corruption_level=corruption_level)
print('Male Autoencoder Cost Value: %.4f') % (Mcost)

Fcost = Female_Autoencoder.negative_log_likelihood(corruption_level)
print('Female Autoencoder Cost Value: %.4f') % (Fcost)
Male Autoencoder Cost Value: -49.1153
Female Autoencoder Cost Value: -30.0500

In [71]:
# Encoding the data
Male_Autoencoder_Hidden_Values = pd.DataFrame(Male_Autoencoder.get_hidden_values(data_M_numerical),
                                              columns = ['Autoencoder_1','Autoencoder_2'])

Female_Autoencoder_Hidden_Values = pd.DataFrame(Female_Autoencoder.get_hidden_values(data_F_numerical),
                                                columns = ['Autoencoder_1','Autoencoder_2'])
In [72]:
# Plot components - To inspect what the Neural Network Calculated
Female_Autoencoder_Hidden_Values.plot(kind='hexbin', x='Autoencoder_1', y='Autoencoder_2', gridsize=5)

Male_Autoencoder_Hidden_Values.plot(kind='hexbin', x='Autoencoder_1', y='Autoencoder_2', gridsize=5)
Out[72]:
<matplotlib.axes.AxesSubplot at 0x7f71bfea3ed0>
In [73]:
# Nothing interesting can be seen from the Autoencoder - will retry later

# No normalisation was applied which may explain for the poor results
In [74]:
# Get the missing Data
Merged_Data = Data_CF[['athlete_ID',
                        'nameURL',
                        'First_Name',
                        'Last_Name',
                        'Region',
                        'sex&division',
                        'Gender']]

# Merge Data togeather - Male
Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.merge(Merged_Data,
                                                       how = 'left',
                                                       left_index = True,
                                                       right_index = True)

# Merge Data togeather - Female
Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.merge(Merged_Data,
                                                           how = 'left',
                                                           left_index = True,
                                                           right_index = True)
In [75]:
# Recalculate Points Total of Remaining Atheletes
Data_CF_VA_Scores_Male['Overall_Points'] = Data_CF_VA_Scores_Male[[' score1', ' score2',' score3',' score4',' score5',' score6']].sum(axis=1)

Data_CF_VA_Scores_Female['Overall_Points'] = Data_CF_VA_Scores_Female[[' score1', ' score2',' score3',' score4',' score5',' score6']].sum(axis=1)

# Create Percentiles
Data_CF_VA_Scores_Male['Percentiles100'] = pd.qcut(Data_CF_VA_Scores_Male.Overall_Points.as_matrix(), 
                                                             q = 100,
                                                             labels=False)
                                                     
Data_CF_VA_Scores_Female['Percentiles100'] = pd.qcut(Data_CF_VA_Scores_Female.Overall_Points.as_matrix(), 
                                                             q = 100,
                                                             labels=False)

Actual_Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male.copy()
Actual_Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female.copy()
In [76]:
Data_CF_VA_Scores_Male.describe().T
Out[76]:
count mean std min 25% 50% 75% max
overall-rank 6966 3873.563307 2630.937892 1.00 1500.50 3768.5 6133.50 8615.00
score1 6966 244.293138 67.779209 0.00 200.00 255.0 300.00 445.00
score2 6966 332.944875 68.750118 1.00 288.00 330.0 376.00 629.00
score3 6966 33.496411 17.333797 0.00 21.00 34.0 46.00 90.00
score4 6966 88.583979 19.393890 1.00 74.00 90.0 97.00 178.00
score5 6966 247.969423 58.465826 1.00 214.00 248.0 283.00 516.00
score6 6966 86.604938 23.527207 1.00 72.00 88.0 101.00 169.00
Weight_kg 6966 83.589004 10.374719 16.00 77.00 83.0 89.00 159.00
age 6966 31.681740 7.878159 15.00 26.00 30.0 36.00 71.00
Height_cm 6966 179.194829 7.288843 91.44 175.26 180.0 183.00 210.82
athlete_ID 6966 70315.030146 62465.519528 1462.00 20329.75 52089.5 99716.75 429940.00
Overall_Points 6966 1033.892765 223.849837 96.00 876.00 1029.0 1185.00 1865.00
Percentiles100 6966 49.442148 28.868736 0.00 24.00 49.0 74.00 99.00
In [77]:
Data_CF_VA_Scores_Female.describe().T
Out[77]:
count mean std min 25% 50% 75% max
overall-rank 2481 1887.051189 1364.549352 1.00 629.00 1775.0 3037.00 4495.00
score1 2481 221.110036 62.440000 1.00 179.00 222.0 264.00 405.00
score2 2481 323.032648 71.156632 1.00 276.00 322.0 369.00 642.00
score3 2481 27.544942 18.117151 1.00 13.00 26.0 40.00 90.00
score4 2481 77.161225 14.132231 1.00 65.00 77.0 90.00 168.00
score5 2481 223.698509 66.797722 1.00 186.00 225.0 271.00 484.00
score6 2481 70.387344 28.603383 1.00 55.00 75.0 89.00 152.00
Weight_kg 2481 61.995163 7.230478 4.00 57.00 61.0 66.00 102.00
age 2481 31.762596 7.982925 14.00 26.00 30.0 36.00 78.00
Height_cm 2481 165.419976 7.070367 121.92 160.02 165.1 170.18 187.96
athlete_ID 2481 70210.597340 61715.388270 1632.00 22455.00 52692.0 97473.00 423947.00
Overall_Points 2481 942.934704 223.362178 156.00 788.00 938.0 1092.00 1835.00
Percentiles100 2481 49.438130 28.887817 0.00 24.00 49.0 74.00 99.00
In [78]:
# Exploring the Dataset further by creating to similiarity matrices per dataset

# 1 - Similarity Matrix of [Age, Weight, Height]
# 2 - Similarity Matrix of [Score1, Score2, Score3, Score4, Score5, Score6]

# Number of Components Required
n_components = 2
In [79]:
# Convert Dataframes to Numpy Arrays

# Get Lists for Subsetting
Athele_Characteristics = ['Weight_kg', 'age', 'Height_cm']
Scores_List = [' score1', ' score2', ' score3', ' score4', ' score5', ' score6']
In [80]:
# Get Arrays - All four
AC_Male    = Data_CF_VA_Scores_Male[Athele_Characteristics]
AC_Female  = Data_CF_VA_Scores_Female[Athele_Characteristics]

SL_Male    = Data_CF_VA_Scores_Male[Scores_List]
SL_Female  = Data_CF_VA_Scores_Female[Scores_List]
In [81]:
# Using Multidimensional Scaling - Dimensionality Reduction

# Import the Module
from sklearn import manifold

# Define a Function for ease of Processing
def mds_function(X, n_components, name):
    # Time the Processing
    t0 = time.time()
    # Define the Function
    mds = manifold.MDS(n_components, max_iter=100, n_init=1)
    # Produce the Matrix of n-components
    Y = mds.fit_transform(X)
    # Time the Processing
    t1 = time.time()
    print("MDS Transformation of %s: %.2f sec" % (str(name),(t1 - t0)))
    # Return the Outputted Array
    return Y
In [82]:
# Apply the Dimensionality Reduction Technique

AC_Male_Trans = pd.DataFrame(mds_function(AC_Male.as_matrix(), 
                             n_components, 
                             'AC Male'), columns = ['Component_1_AC_Male_Trans','Component_2_AC_Male_Trans'])

AC_Female_Trans = pd.DataFrame(mds_function(AC_Female.as_matrix(), 
                               n_components, 
                               'AC Female'), columns = ['Component_1_AC_Female_Trans','Component_2_AC_Female_Trans'])

SL_Male_Trans = pd.DataFrame(mds_function(SL_Male.as_matrix(), 
                               n_components,
                               'SL_Male'), columns = ['Component_1_SL_Male_Trans','Component_2_SL_Male_Trans'])

SL_Female_Trans = pd.DataFrame(mds_function(SL_Female.as_matrix(), 
                               n_components,
                               'SL_Female'), columns = ['Component_1_SL_Female_Trans','Component_2_SL_Female_Trans'])
MDS Transformation of AC Male: 143.61 sec
MDS Transformation of AC Female: 14.71 sec
MDS Transformation of SL_Male: 146.99 sec
MDS Transformation of SL_Female: 14.63 sec

In [83]:
# Dimensionality Reduction Algorithm 2 - Autoencoder Attempt 2

# Normalise Columns - Using the Function to Loop through columns
def normalize_columns(df):
    # Iterate through each column
    for feature_name in df.columns:
        # Find the Min and Max - Use for scaling
        max_val          = df[feature_name].max()
        min_val          = df[feature_name].min()
        df[feature_name] = (df[feature_name] - min_val) / (max_val - min_val)
    # Return Dataframe Back
    return df

# Apply to Dataframe
AC_Male_Trans_2 = normalize_columns(AC_Male)
AC_Female_Trans_2 = normalize_columns(AC_Female)

SL_Male_Trans_2 = normalize_columns(SL_Male)
SL_Female_Trans_2 = normalize_columns(SL_Female)

# Train the Neural Network on Each Array

# Training
Female_Autoencoder_1 = dA(input = AC_Female_Trans_2.as_matrix(),
                      n_visible = AC_Female_Trans_2.shape[1],
                      n_hidden = 2)

# Training
Male_Autoencoder_1 = dA(input = AC_Male_Trans_2.as_matrix(),
                      n_visible = AC_Male_Trans_2.shape[1],
                      n_hidden = 2)

# Training
Female_Autoencoder_2 = dA(input = SL_Female_Trans_2.as_matrix(),
                      n_visible = SL_Female_Trans_2.shape[1],
                      n_hidden = 2)

# Training
Male_Autoencoder_2 = dA(input = SL_Male_Trans_2.as_matrix(),
                      n_visible = SL_Male_Trans_2.shape[1],
                      n_hidden = 2)
In [84]:
# Encoding the data 
AC_Male_Trans_2 = pd.DataFrame(Male_Autoencoder_1.get_hidden_values(AC_Male_Trans_2.as_matrix()),
                                              columns = ['Autoencoder_1_AC_Male_Trans_2','Autoencoder_2_AC_Male_Trans_2'])

AC_Female_Trans_2 = pd.DataFrame(Female_Autoencoder_1.get_hidden_values(AC_Female_Trans_2.as_matrix()),
                                                columns = ['Autoencoder_1_AC_Female_Trans_2','Autoencoder_2_AC_Female_Trans_2'])

SL_Male_Trans_2 = pd.DataFrame(Male_Autoencoder_2.get_hidden_values(SL_Male_Trans_2.as_matrix()),
                                              columns = ['Autoencoder_1_SL_Male_Trans_2','Autoencoder_2_SL_Male_Trans_2'])

SL_Female_Trans_2 = pd.DataFrame(Female_Autoencoder_2.get_hidden_values(SL_Female_Trans_2.as_matrix()),
                                                columns = ['Autoencoder_1_SL_Female_Trans_2','Autoencoder_2_SL_Female_Trans_2'])
In [85]:
# Plot Each of the Encodings to see if they are usable...

AC_Male_Trans_2.plot(kind='scatter', x='Autoencoder_1_AC_Male_Trans_2', y='Autoencoder_2_AC_Male_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...AC_Male_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')

AC_Female_Trans_2.plot(kind='scatter', x='Autoencoder_1_AC_Female_Trans_2', y='Autoencoder_2_AC_Female_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...AC_Female_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')

SL_Male_Trans_2.plot(kind='scatter', x='Autoencoder_1_SL_Male_Trans_2', y='Autoencoder_2_SL_Male_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...SL_Male_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')

SL_Female_Trans_2.plot(kind='scatter', x='Autoencoder_1_SL_Female_Trans_2', y='Autoencoder_2_SL_Female_Trans_2')
plt.title('Checking if the Auto-encoder compenents are ''usable''...SL_Female_Trans_2')
plt.xlabel('Component 1')
plt.ylabel('Component 2')

# From inspection each of the plots appear to be usable as a means of Dimensionality Reduction.

# Plotting and the use of other visual encoding will be applied in Tableau to ascertain if there is anything significant
Out[85]:
<matplotlib.text.Text at 0x7f71bd371790>
In [86]:
# Clustering Algorithm - Agglomerative clustering

from sklearn.cluster import AgglomerativeClustering

# X is the Numpy Array
# n is the number of Clusters

def allogmerative_clustering(X, n):
    clustering = AgglomerativeClustering(linkage='complete', n_clusters=n)
    t0 = time.time()
    clustering.fit(X)
    print("%s took: %.2f seconds" % ('Complete linkage', time.time() - t0))
    # Plot Clusters
    plt.scatter(X[:,0], X[:,1], c=clustering.labels_+100)
    # Display the plot
    plt.show()
    # Return the Labels
    return clustering.labels_
In [87]:
# Clustering Algorithm - Agglomerative clustering - MDS Datasets

AC_Male_Trans['Labels_AC_Male_Trans_MDS'] = allogmerative_clustering(AC_Male_Trans.as_matrix(), 4)
 
AC_Female_Trans['Labels_AC_Female_Trans_MDS'] = allogmerative_clustering(AC_Female_Trans.as_matrix(), 4)

SL_Male_Trans['Labels_SL_Male_Trans_MDS'] = allogmerative_clustering(SL_Male_Trans.as_matrix(), 4)

SL_Female_Trans['Labels_SL_Female_Trans_MDS'] = allogmerative_clustering(SL_Female_Trans.as_matrix(), 4)
Complete linkage took: 143.90 seconds

Complete linkage took: 5.08 seconds

Complete linkage took: 139.19 seconds

Complete linkage took: 4.87 seconds

In [88]:
# Clustering Algorithm - Agglomerative clustering - Autoencoder Datasets

# Encoding the data 
AC_Male_Trans_2['Labels_AC_Male_Trans_2_Auto'] = allogmerative_clustering(AC_Male_Trans_2.as_matrix(), 4)

AC_Female_Trans_2['Labels_AC_Female_Trans_2_Auto'] = allogmerative_clustering(AC_Female_Trans_2.as_matrix(), 4)

SL_Male_Trans_2['Labels_SL_Male_Trans_2_Auto'] = allogmerative_clustering(SL_Male_Trans_2.as_matrix(), 4)

SL_Female_Trans_2['Labels_SL_Female_Trans_2_Auto'] = allogmerative_clustering(SL_Female_Trans_2.as_matrix(), 4) 
Complete linkage took: 141.29 seconds

Complete linkage took: 5.05 seconds

Complete linkage took: 139.77 seconds

Complete linkage took: 4.93 seconds

In [89]:
# Collating Data togeather - Sharing Index

SL_Male_Trans_2.index = AC_Male_Trans_2.index = SL_Male_Trans.index = AC_Male_Trans.index = SL_Male.index = AC_Male.index
SL_Female_Trans_2.index = AC_Female_Trans_2.index = SL_Female_Trans.index = AC_Female_Trans.index = SL_Female.index = AC_Female.index
In [90]:
# Get Additional Information 

Final_Cols = Data_CF_VA_Scores_Male.columns - Athele_Characteristics - Scores_List

Data_CF_VA_Scores_Female = Data_CF_VA_Scores_Female[Final_Cols]

Data_CF_VA_Scores_Male = Data_CF_VA_Scores_Male[Final_Cols]
In [91]:
# Merge Male Datasets

# Transformed Data
Data_CF_VA_Male_Final = AC_Male.merge(SL_Male ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(Data_CF_VA_Scores_Male ,how = 'left', left_index = True, right_index= True)
# Merge Original
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(Actual_Data_CF_VA_Scores_Male ,how = 'left', left_index = True, right_index= True)
# Merge Labels and Dimensionality Reduced Data
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(AC_Male_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(SL_Male_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(AC_Male_Trans_2 ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Male_Final = Data_CF_VA_Male_Final.merge(SL_Male_Trans_2 ,how = 'left', left_index = True, right_index= True)
In [92]:
# Merge Female Datasets

# Transformed Data
Data_CF_VA_Female_Final = AC_Female.merge(SL_Female ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(Data_CF_VA_Scores_Female ,how = 'left', left_index = True, right_index= True)
# Merge Original
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(Actual_Data_CF_VA_Scores_Female ,how = 'left', left_index = True, right_index= True)
# Merge Labels and Dimensionality Reduced Data
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(AC_Female_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(SL_Female_Trans ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(AC_Female_Trans_2 ,how = 'left', left_index = True, right_index= True)
Data_CF_VA_Female_Final = Data_CF_VA_Female_Final.merge(SL_Female_Trans_2 ,how = 'left', left_index = True, right_index= True)
In [93]:
# Save both Datasets - Male and Female

Data_CF_VA_Male_Final.to_csv('Subsetted_Male_Dataset.csv',
                              sep = ',',
                              index = False)

Data_CF_VA_Female_Final.to_csv('Subsetted_Female_Dataset.csv',
                                  sep = ',',
                                  index = False)
In [94]:
# Time to Process
print('Time to Process Script: %.5f Minutes') % ((time.time() - start)/60)
Time to Process Script: 15.65841 Minutes

In [95]:
# PHASE 5 - Final Enhancements and Visulisations

# The final phase will involve Tableau for comparisons and presentation of results
In [96]:
Image(filename='tableau.png')
Out[96]: